% updated 12/20/2016
% with alternative specification of competition adjacency matrix

% model 1: Y = lambda*A*Y + rho*B*Y+ X*b + error
%% CONTROL PANEL
%clear
parentpath = cd(cd('..'));

warning('ON','msg:id')

dataset = 'cati_merged_sdc_rnd'; % Choose from: {'cati','sdc_rnd','sdc_rnd_ctt','cati_merged_sdc_rnd','cati_merged_sdc_rnd_ctt'}
rndlink = 0; % 1 if link duration is random, 0 if link duration is fixed
lnkyear = 5; % Choose from: {3,4,5,6,7}.
sicdgit = 1; % Choose from: {1,10,100,1000}

%compmat = 1; % if use the compustat competition matrix
%compmat = 2; % if use the orbis competition matrix
%compmat = 3; % if use the hoberg_phillips competition matrix

timelag = 1; % time lag of RND stock

fac = 1e3; % scale factor for output

yr1 = 1966;
yr2 = 2006;

% drop the first t0 years (i.e. the starting year is (yr1+t0))
t0 = 1;
%%
switch dataset
    case 'cati'
        n = 11093;
    case 'sdc_rnd'
        n = 12253;
    case 'sdc_rnd_ctt'
        n = 21387;
    case 'cati_merged_sdc_rnd'
        n = 19654;
    case 'cati_merged_sdc_rnd_ctt'
        n = 27147;
    otherwise
        disp('No appropriate dataset specified.')
end
T = yr2-yr1+1;
%%
ID = zeros(n,2);
ID(:,1) = (1:n)';
for s = 1:T
    yr = yr1+s-1;
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID SIC SALES PROFIT RND EMPL FIXED_CAPITAL TOT_COST.csv'];
    flid = fopen(file);
    dat0 = textscan(flid,'%s %s %s %s %s %s %s %s','delimiter',';','headerlines',1);
    fclose(flid);
    %---------------------------------   
    FID = str2double(strtrim(dat0{1})); % FID
    SIC = str2double(strtrim(dat0{2})); % SIC
    FID(isnan(FID)) = 0;
    SIC(isnan(SIC)) = 0;
    SIC(floor(SIC/1000)==0) = 0; % drop firms with less than 4-digit SIC
    
    for i = 1:n
        if SIC(FID==i) > 0
            if SIC(FID==i) ~= ID(i,2)
                if ID(i,2) == 0
                    ID(i,2) = SIC(FID==i);
                else
                    ID(i,2) = -1;
                    %warning('msg:id','SIC code changes over time')
                end
            end
        end
    end
end
ID(:,2) = floor(ID(:,2)/sicdgit);
ID = ID(ID(:,2)>0,:); % drop firms with missing SIC
n = size(ID,1);

file = [parentpath '/' dataset '/location/' 'ID NATION_ID NATION CITY LOC_X LOC_Y.csv'];
flid = fopen(file);
dat0 = textscan(flid,'%s %s %s %s %s %s','delimiter',';','headerlines',1);
fclose(flid);
tmp1 = str2double(strtrim(dat0{1}));
tmp2 = str2double(strtrim(dat0{2}));

loc = zeros(size(ID,1),1);
for i = 1:n
    if sum(tmp1==ID(i,1)) == 1
        loc(i) = tmp2(tmp1==ID(i,1));
    end
end
ID = ID(loc==235,:);
n = size(ID,1);

if compmat == 1
    file = [parentpath '/' dataset '/comp_mat_compustat/comp_mat.mat'];
elseif compmat == 2
    file = [parentpath '/' dataset '/comp_mat_orbis/comp_mat.mat'];
else
    file = [parentpath '/' dataset '/comp_mat_hoberg_phillips/comp_mat.mat'];
end
    
load(file)
B = B(ID(:,1),:);
B = B(:,ID(:,1));
B(isnan(B)) = 0;
%%
dm = cell(T,1);
X1 = cell(T,1);
X2 = cell(T,1);
X3 = cell(T,1);
for s = 1:T
    yr = yr1+s-1;
    %---------------------------------
    firm = zeros(n,2);
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID OUTPUT.csv'];
    flid = fopen(file);
    dat1 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid1 = str2double(strtrim(dat1{1}));
    outp = str2double(strtrim(dat1{2}));
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr-timelag) '_ID RND_STOCK.csv'];
    flid = fopen(file);
    dat2 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid2 = str2double(strtrim(dat2{1}));
    stck = str2double(strtrim(dat2{2}));
    %---------------------------------
    for i = 1:n
        if (sum(fid1==ID(i,1))==1)&&(sum(fid2==ID(i,1))==1)
            firm(i,1) = outp(fid1==ID(i,1))/fac;
            firm(i,2) = stck(fid2==ID(i,1));
        end
    end
    firm(isnan(firm)) = 0;
    %---------------------------------
    % load RND adjacency matrix
    if rndlink == 1
        file = [parentpath '/' dataset '/adjacency_matrix_rand_' num2str(lnkyear) '_years/' int2str(yr) '_adjacency_matrix.mat'];
    else
        file = [parentpath '/' dataset '/adjacency_matrix_' num2str(lnkyear) '_years/' int2str(yr) '_adjacency_matrix.mat'];
    end
    load(file)
    A = A(ID(:,1),:);
    A = A(:,ID(:,1));
    %---------------------------------
    % indicate firms with missing observations
    dm{s} = ones(n,1);
    for i = 1:n
        if (min(firm(i,:))<=0)
            dm{s}(i) = 0;
        end
    end
    %---------------------------------
    % define variables
    X1{s} = firm;
    X2{s} = A*firm;
    X3{s} = B*firm;
end
%%
% drop the first t0 year(s)
dt = cat(1,dm{:});
dt = dt(n*t0+1:end);
T1 = T-t0;
%---------------------------------
% drop firms that appear less than twice
di = (1:n*T1)';
dj = kron(ones(T1,1),(1:n)');
Dn = sparse(di,dj,dt);
dd = sum(Dn,1);
Dn = Dn(:,dd>1); % firms appearing more than once in the panel
np = size(Dn,2);

En = speye(n);
En = En(dd>1,:); % firms appearing more than once in the panel
ID = En*ID;
for s = 1:T
    dm{s} = En*dm{s};
    X1{s} = En*X1{s};
    X2{s} = En*X2{s};
    X3{s} = En*X3{s};
end
%%
D1 = cell(T1,1);
D2 = cell(T1,1);
Yn = cell(T1,1);
Zn = cell(T1,1);
Qn = cell(T1,1);
pid = cell(T1,1);
tid = cell(T1,1);
for s = (t0+1):T
    Da = diag(dm{s});
    Da = Da(sum(Da,2)==1,:);
    Db = zeros(size(Da,1),T1);
    Db(:,s-t0) = 1;
    D1{s-t0} = Da;
    D2{s-t0} = Db;    
    
    Yn{s-t0} = Da*X1{s}(:,1);
    Zn{s-t0} = Da*[X2{s}(:,1),X3{s}(:,1),X1{s}(:,2)];
    Qn{s-t0} = Da*[X2{s}(:,2),X3{s}(:,2),X1{s}(:,2)];
    
    pid{s-t0} = Da*ID(:,1);
    tid{s-t0} = (s-t0)*ones(size(Da,1),1);
end

D1 = cat(1,D1{:});
D2 = cat(1,D2{:});
Yn = cat(1,Yn{:});
Zn = cat(1,Zn{:});
Qn = cat(1,Qn{:});

temp = [cat(1,pid{:}),cat(1,tid{:}),Yn,Zn,Qn];
name = cell(1,size(temp,2));
name{1} = 'pid';
name{2} = 'tid';
name{3} = 'y';
for j = 1:size(Zn,2)
    name{j+3} = ['z' num2str(j)];
end
for j = 1:size(Qn,2)
    name{j+3+size(Zn,2)} = ['q' num2str(j)];
end

filename = ['stata_data3\data_' num2str(compmat) '.csv'];
delete(filename);
csvwrite_with_headers(filename,temp,name);